### LOAD DATA

# Each file is read with load(), which restores the objects stored in the file
# into the current environment under the names they were saved with.

# A) Current Period Papers (2009 & 2010)

load("Articles_2009-2010.gzip")

# B) Prior Year Papers (2008)

load("Articles_2008.gzip")

# C) Author Data

load("Authors.gzip")

# Fail fast if an object used later in this script is missing. None of these
# names is assigned anywhere in the script before its first use, so each must
# already exist at this point (restored by load() above or set beforehand).
# NOTE(review): which file supplies which object (K in particular) cannot be
# told from this script - confirm.

for (required_object in c("articles", "articles2008", "authors", "K")) {
  if (!exists(required_object)) {
    stop("Object '", required_object,
         "' is required by this script but was not found after loading.",
         call. = FALSE)
  }
}

rm(required_object)



### LISTS OF AUTHORS AND PAPERS

# Helper shared by both periods: turn bracketed, single-quoted author strings
# into a list with one character vector of author names per paper.
#   author_strings: one string per paper, e.g. "['A', 'B']" gives c("A", "B")
#   paper_ids:      identifiers used as the names of the returned list
# NOTE(review): an author name that itself contains "'" would be split into
# several pieces - confirm that no such names occur in the data.

parse_author_strings <- function(author_strings, paper_ids) {
  # Remove brackets (first "[" and first "]" only, matched literally):
  stripped <- sub("[", "", author_strings, fixed = TRUE)
  stripped <- sub("]", "", stripped, fixed = TRUE)

  # Splitting on the quote character leaves the names plus empty strings and
  # ", " separators; keep only the names:
  pieces <- strsplit(stripped, "'")
  parsed <- lapply(pieces, function(x) x[!(x %in% c("", ", "))])

  # Article identifiers:
  names(parsed) <- paper_ids
  parsed
}

# A) Authors on Articles Published in 2009 & 2010

authorlist <- parse_author_strings(articles$authors, rownames(articles))

###

# B) Authors on Articles Published in 2008

preauthorlist <- parse_author_strings(articles2008$authors,
                                      rownames(articles2008))

# The helper is not needed again; drop it so it is not part of the workspace
# that is saved at the end of the script.

rm(parse_author_strings)

###

# C) Full List of Active Researchers

### NOTE: The 'names' attribute has the author names.

# Papers per author among the 2009 & 2010 articles:

current_counts <- table(unlist(authorlist))

# Authors who appear on a 2008 paper but on none of the 2009 & 2010 papers:

prior_authors <- sort(unique(unlist(preauthorlist)))
prior_only <- prior_authors[is.na(match(prior_authors, names(current_counts)))]

# They enter the list with a count of zero:

zero_counts <- setNames(numeric(length(prior_only)), prior_only)

# Combine both groups and order alphabetically by author name:

all_counts <- c(current_counts, zero_counts)
authorcounts_full <- all_counts[order(names(all_counts))]

# Number of authors/researchers:

N_auth_full <- length(authorcounts_full)

# Drop the scratch objects of this step:

rm(current_counts, prior_authors, prior_only, zero_counts, all_counts)

###

# D) List of Papers by Each Author

# One entry per (paper, author) pair: the paper id and the author name.

paper_of_entry <- rep(rownames(articles), times = lengths(authorlist))
author_of_entry <- unlist(authorlist)

# Paper ids grouped by author name:

papers_by_author <- split(paper_of_entry, author_of_entry)

# Start every researcher with an empty paper list (same names and order as
# authorcounts_full), then fill in those who have papers:

paperlist_full <- lapply(authorcounts_full, function(n_papers) character(0))
paperlist_full[names(papers_by_author)] <- papers_by_author

# Check variables (printed for inspection): the first should be TRUE and the
# three totals should agree.

all(lengths(paperlist_full) == authorcounts_full)

sum(lengths(authorlist))
sum(lengths(paperlist_full))
sum(authorcounts_full)

# Drop the scratch objects of this step:

rm(paper_of_entry, author_of_entry, papers_by_author)



### ANALYTIC VARIABLES

# A) Authors' Prior Experience and Impact

# Check (printed): authorlist is in the same order as the rows of 'articles'.

all(names(authorlist) == rownames(articles))

# Row of 'authors' for every (paper, author) entry. match() requires an exact
# row-name match; indexing the data frame with the names directly would also
# accept a unique partial match, so an author missing from 'authors' could
# silently pick up another author's values. Unmatched authors give NA.

author_rows <- match(unlist(authorlist), rownames(authors))

# Lists for each paper:

experience_by_paper <- relist(authors[["pre_experience"]][author_rows],
                              skeleton = authorlist)
impact_by_paper <- relist(authors[["pre_avg_impact"]][author_rows],
                          skeleton = authorlist)

# Add variables to articles table:
# (vapply guarantees one numeric value per paper; integer results are
# promoted to double)

articles$num_authors <- lengths(authorlist)
articles$max_pre_experience <- vapply(experience_by_paper, max, numeric(1))
articles$avg_impact <- vapply(impact_by_paper, mean, numeric(1))

# Drop the scratch objects of this step:

rm(author_rows, experience_by_paper, impact_by_paper)

###

# B) Limit to Experienced Researchers

# All (author, paper) entries in one vector, and a flag for the entries whose
# paper has max_pre_experience below K:

flat_papers <- unlist(paperlist_full)
below_threshold <- articles[flat_papers, "max_pre_experience"] < K

# Per author, count the flagged papers and subtract them from the full count.
# What remains is the number of papers with an experienced researcher:

n_below <- sapply(relist(below_threshold, skeleton = paperlist_full), sum)

authorcounts <- authorcounts_full - n_below

# List of papers with an experienced researcher: blank out the flagged
# entries, restore the per-author structure, then drop the blanks.

kept_papers <- replace(flat_papers, below_threshold, NA)

paperlist <- lapply(relist(kept_papers, skeleton = paperlist_full),
                    function(ids) ids[!is.na(ids)])

# Check (printed): counts and list lengths agree for every author.

all(authorcounts == lengths(paperlist))

# Drop the scratch objects of this step:

rm(flat_papers, below_threshold, n_below, kept_papers)

###

# C) External Connections

# Number of papers per coauthor on a paper:
# (each listed author's entry in authorcounts, looked up by name and arranged
# in the same per-paper structure as authorlist)

coauthcounts <- relist(authorcounts[unlist(authorlist)], authorlist)

# Total number of projects by the team members (Z^prj):
# (sum of the per-author counts above, one value per paper)

articles$tot_proj <- sapply(coauthcounts, sum) 

# Make bipartite graph to papers:
# Rows index authors by their position in authorcounts (1..N_auth_full);
# columns index papers by the numeric value of the paper id. The two index
# vectors line up because author k contributes authorcounts[k] entries and
# paperlist[[k]] holds exactly that many papers (see the check in the
# previous step).
# NOTE(review): assumes the row names of 'articles' are positive integers
# stored as strings - confirm.
# (the Matrix package is attached here, midway through the script)

library(Matrix)

temp1 <- rep(1:N_auth_full, authorcounts)
temp2 <- as.numeric(unlist(paperlist))

G_pap <- sparseMatrix(i=temp1, j=temp2)

# Check number of authors on each paper:
# temp1 flags papers whose max_pre_experience is at least K; temp2 flags
# papers whose column sum in G_pap differs from the number of listed authors.

temp1 <- articles$max_pre_experience >= K
temp2 <- colSums(G_pap)[as.numeric(rownames(articles))] != lengths(authorlist)

# A handful of papers show two authors with same name:
# (treated as same person)
# The rows flagged by both conditions are displayed for inspection.

articles[temp1 & temp2,]

# Unweighted graph among authors:
# (operation works b/c G_pap is logical):
# NOTE(review): whether %*% on pattern matrices returns logical or numeric
# (shared-paper counts) depends on the Matrix version; if numeric, G_any is
# weighted rather than 0/1. Confirm against the installed version, and see
# the boolean product %&% in the Matrix documentation.

G_any <- G_pap %*% t(G_pap)

# Remove self-ties:

diag(G_any) <- FALSE

# Team degree (Z^deg):
# temp1[p, a] combines the G_any entries between author a and the authors of
# paper p; temp2 keeps only the entries where a is on paper p. The row sums
# of the difference therefore cover only authors who are not on the paper.

temp1 <- t(G_pap) %*% G_any

temp2 <- temp1 * t(G_pap)

temp3 <- rowSums(temp1 - temp2)

# Map back from paper id to the rows of 'articles':

articles$team_deg <- temp3[as.numeric(rownames(articles))]

###

# C) Researcher Skills

# Columns are read from 'authors' directly rather than through attach():
# attached columns sit behind the global environment on the search path, so a
# workspace object with the same name as a column would silently be used in
# place of the column.

# Researchers with prior experience of at least K:

experienced <- authors[["pre_experience"]] >= K

# Topic-share column in 'authors' -> skill code given when the share is
# above 0.5:

skill_areas <- c(apmic = "apm", methy = "mty", macro = "mac",
                 bzfin = "bfn", agloc = "alo")

# Discretized skills:
# (areas are applied in the order listed, so a later area would overwrite an
# earlier one if a researcher qualified for both)

authors$skill_code <- NA_character_

for (area in names(skill_areas)) {
  is_specialist <- experienced & authors[[area]] > 0.5
  authors$skill_code[is_specialist] <- skill_areas[[area]]
}
# (note: this leaves generalists as NA, along with inexperienced researchers)

authors$skill_code <- as.factor(authors$skill_code)

# Indicator for generalists:
# (experienced researchers with no topic share above 0.5)

no_majority <- Reduce(`&`, lapply(authors[names(skill_areas)],
                                  function(share) share <= 0.5))

general50 <- experienced & no_majority

names(general50) <- rownames(authors)

# Drop the scratch objects of this step:

rm(experienced, skill_areas, area, is_specialist, no_majority)

###

# D) Team Skills

# The scratch names temp, temp1..temp5 and total_skills are kept in this step
# because the clean-up under SAVE removes them by name.

# Skill code of every (paper, author) entry, in authorlist order. match()
# looks authors up by exact row name; indexing the data frame with the names
# directly would also accept a unique partial match. Authors without a row in
# 'authors' give NA.

member_rows <- match(unlist(authorlist), rownames(authors))
member_skill <- as.character(authors[["skill_code"]][member_rows])

# List of researcher skill codes on each team:

skill_by_paper <- relist(member_skill, skeleton = authorlist)

# Function to check if a team holds more than one distinct (non-NA) code:

check <- function(x) length(unique(na.omit(x))) > 1

# Indicator for skill differences on team (X^dif):

articles$skill_diff <- vapply(skill_by_paper, check, logical(1))

# Indicator for a generalist on team (X^gen):

gen_by_paper <- relist(general50[unlist(authorlist)], skeleton = authorlist)

articles$any_gen50 <- vapply(gen_by_paper, any, logical(1))

# Paper index of every (paper, author) entry. Declaring all papers as factor
# levels keeps a (possibly empty) group for papers without listed authors, so
# the per-paper results below always have one element per paper.

temp <- factor(rep(seq_along(authorlist), lengths(authorlist)),
               levels = seq_along(authorlist))

# Indicators for authors with skill in each area, as lists by paper:
# (used below to compute skill deficit)

temp1 <- split(member_skill == "apm", temp)
temp2 <- split(member_skill == "mty", temp)
temp3 <- split(member_skill == "mac", temp)
temp4 <- split(member_skill == "bfn", temp)
temp5 <- split(member_skill == "alo", temp)

# Numbers of authors with skill in each area:
# (NA indicators, i.e. authors without a skill code, are not counted)

area_cols <- c("apmic", "methy", "macro", "bzfin", "agloc")

total_skills <- matrix(nrow = nrow(articles), ncol = length(area_cols))
colnames(total_skills) <- area_cols

total_skills[, "apmic"] <- vapply(temp1, sum, integer(1), na.rm = TRUE)
total_skills[, "methy"] <- vapply(temp2, sum, integer(1), na.rm = TRUE)
total_skills[, "macro"] <- vapply(temp3, sum, integer(1), na.rm = TRUE)
total_skills[, "bzfin"] <- vapply(temp4, sum, integer(1), na.rm = TRUE)
total_skills[, "agloc"] <- vapply(temp5, sum, integer(1), na.rm = TRUE)

# Skill deficit (Z^def):
# For each paper, add up its values in the area columns of 'articles' over
# the areas in which no team member has the skill. The indicator matrix keeps
# its shape, so entry [i, j] multiplies the matching entry of 'articles'.

articles$skill_deficit <- rowSums(articles[, area_cols] * (total_skills == 0))

# Number of topical areas:
# (area columns of 'articles' with a value above zero)

articles$num_areas <- rowSums(articles[, area_cols] > 0)

# Drop the scratch objects that the later clean-up does not know about:

rm(member_rows, member_skill, skill_by_paper, gen_by_paper, area_cols)



### SAVE

# Clean up:
# (scratch objects left over from the steps above)

rm(temp, temp1, temp2, temp3, temp4, temp5, total_skills)

# Restrict main table to articles with an experienced author:
# (the unrestricted table is kept as articles_full)

articles_full <- articles

# which() keeps only the rows where the comparison is TRUE. Subsetting with
# the logical vector itself would turn every NA comparison (a paper whose
# max_pre_experience is NA) into a row filled with NA.

articles <- articles[which(articles$max_pre_experience >= K), ]

# Save:
# (writes every object currently in the workspace to the file)

save.image("xxx.RData")
